In [124]:
import pandas as pd
import plotly.offline as pyo
import plotly.graph_objs as go
# Set notebook mode to work in offline
pyo.init_notebook_mode()
In [110]:
df1 = pd.read_csv("Doctorate_students_2022-04-26-ta_resdig_sctech_rdperes_perf__tsc00028.csv")
In [111]:
df1 = df1.dropna()
In [112]:
df1_spain = df1[df1.geo == "Spain"]
df1_spain
Out[112]:
time _geo geo value unit _unit isced97 _isced97 field _field sex _sex _flag
92 2005 ES Spain 0.2 Percentage of total population aged 20-29 PC_Y20-29 Second stage of tertiary education leading to ... ED6 Science, mathematics and computing, engineerin... EF4_5 Women W b
394 2005 ES Spain 0.3 Percentage of total population aged 20-29 PC_Y20-29 Second stage of tertiary education leading to ... ED6 Science, mathematics and computing, engineerin... EF4_5 Men M b
699 2005 ES Spain 0.3 Percentage of total population aged 20-29 PC_Y20-29 Second stage of tertiary education leading to ... ED6 Science, mathematics and computing, engineerin... EF4_5 Total T b
In [123]:
import plotly.express as px

fig = px.scatter(df1_spain, x="value", y="unit", animation_frame="time", animation_group="geo",
           size="value", color="sex", hover_name="geo", facet_col="geo",
           size_max=45)
fig.show()
In [114]:
df_prop_gender_sci = pd.read_csv("prop_gender_science.csv")
df_prop_gender_sci
Out[114]:
time _geo geo value sex _sex unit _unit
0 2012 AT Austria 7.5 Men M Percentage PC
1 2012 AT Austria 12.2 Women W Percentage PC
2 2012 BE Belgium 9.5 Men M Percentage PC
3 2012 BE Belgium 14.7 Women W Percentage PC
4 2012 BG Bulgaria 2.4 Men M Percentage PC
... ... ... ... ... ... ... ... ...
127 2016 SI Slovenia 7.1 Women W Percentage PC
128 2016 SK Slovakia 9.3 Men M Percentage PC
129 2016 SK Slovakia 12.0 Women W Percentage PC
130 2016 UK United Kingdom 1.8 Men M Percentage PC
131 2016 UK United Kingdom 5.1 Women W Percentage PC

132 rows × 8 columns

In [267]:
import plotly.express as px

fig = px.scatter(df_prop_gender_sci, x="geo", y="value", facet_col="time",
           size="value", color="sex", color_discrete_sequence=["#EF7949", "#83539D"])
fig.show()
In [132]:
df_total_personnel_in_rd = pd.read_csv("total_personnel_science_in_rd_by_gender.csv")
df_total_personnel_in_rd = df_total_personnel_in_rd[df_total_personnel_in_rd.unit == "Full-time equivalent (FTE)"]
df_total_personnel_in_rd = df_total_personnel_in_rd.dropna(subset=["value"])
df_total_personnel_in_rd = df_total_personnel_in_rd.sort_values(by='time')
df_total_personnel_in_rd = df_total_personnel_in_rd[df_total_personnel_in_rd.sectperf == "All sectors"]
df_total_personnel_in_rd = df_total_personnel_in_rd[df_total_personnel_in_rd.prof_pos == "Total"]
df_total_personnel_in_rd
Out[132]:
time _geo geo value sectperf _sectperf prof_pos _prof_pos sex _sex unit _unit _flag
53130 1980 HU Hungary 25589.0 All sectors TOTAL Total TOTAL Total T Full-time equivalent (FTE) FTE NaN
52976 1980 ES Spain 35496.0 All sectors TOTAL Total TOTAL Total T Full-time equivalent (FTE) FTE NaN
52708 1981 BE Belgium 32531.0 All sectors TOTAL Total TOTAL Total T Full-time equivalent (FTE) FTE e
53131 1981 HU Hungary 51512.0 All sectors TOTAL Total TOTAL Total T Full-time equivalent (FTE) FTE d
53211 1981 IS Iceland 744.0 All sectors TOTAL Total TOTAL Total T Full-time equivalent (FTE) FTE NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ...
53376 2020 MK North Macedonia 2029.0 All sectors TOTAL Total TOTAL Total T Full-time equivalent (FTE) FTE NaN
53467 2020 NO Norway 48947.0 All sectors TOTAL Total TOTAL Total T Full-time equivalent (FTE) FTE NaN
53494 2020 PL Poland 173392.0 All sectors TOTAL Total TOTAL Total T Full-time equivalent (FTE) FTE NaN
53435 2020 NL Netherlands 161564.0 All sectors TOTAL Total TOTAL Total T Full-time equivalent (FTE) FTE p
53395 2020 MT Malta 1807.0 All sectors TOTAL Total TOTAL Total T Full-time equivalent (FTE) FTE p

1562 rows × 13 columns

In [133]:
df_total_personnel_in_rd.geo.unique(), df_total_personnel_in_rd.sex.unique()
Out[133]:
(array(['Hungary', 'Spain', 'Belgium', 'Iceland', 'Sweden', 'Finland',
        'United Kingdom', 'Italy', 'Netherlands', 'Germany', 'Norway',
        'Denmark', 'Austria', 'France', 'Ireland', 'Portugal',
        'Switzerland', 'Greece', 'Turkey', 'Cyprus', 'Slovenia',
        'Bulgaria', 'Romania', 'Latvia', 'Russia', 'Slovakia', 'Poland',
        'Czechia', 'Lithuania', 'Estonia', 'Malta',
        'Euro area (19 countries)', 'Luxembourg',
        'European Union - 27 countries (from 2020)', 'Croatia',
        'North Macedonia', 'Serbia', 'Montenegro',
        'Bosnia and Herzegovina'], dtype=object),
 array(['Total', 'Women'], dtype=object))
In [143]:
def filter_by_if_year_has_total_and_women_get_percentage(df_total_personnel_in_rd, country_name):
    df_total_personnel_in_rd_filt = df_total_personnel_in_rd[df_total_personnel_in_rd.geo == country_name].sort_values("time")

    years_with_data =  df_total_personnel_in_rd_filt.time.value_counts() > 1
    years_with_data = years_with_data.index[years_with_data]
    
    #print(df_total_personnel_in_rd_filt.tail(20).values)

    df_total_personnel_in_rd_filt = df_total_personnel_in_rd_filt[df_total_personnel_in_rd_filt.time.isin(years_with_data)]

    total_year = {}
    for row in df_total_personnel_in_rd_filt.values:
        if row[8] == "Total":
            total_year[row[0]] = row[3]
            
    print(country_name)
    if df_total_personnel_in_rd_filt.empty:
        print(df_total_personnel_in_rd_filt, years_with_data)
        return

    df_total_personnel_in_rd_filt["percent"] = df_total_personnel_in_rd_filt.apply(lambda x: x["value"]/total_year[x["time"]], axis=1)
    return df_total_personnel_in_rd_filt

df_cleaned = pd.DataFrame()
for country  in df_total_personnel_in_rd.geo.unique():
    df_country_cleaned = filter_by_if_year_has_total_and_women_get_percentage(df_total_personnel_in_rd, country)
    df_cleaned = pd.concat([df_cleaned, df_country_cleaned])
    
df_cleaned
Hungary
Spain
Belgium
Iceland
Sweden
Finland
Empty DataFrame
Columns: [time, _geo, geo, value, sectperf, _sectperf, prof_pos, _prof_pos, sex, _sex, unit, _unit, _flag]
Index: [] Int64Index([], dtype='int64')
United Kingdom
Empty DataFrame
Columns: [time, _geo, geo, value, sectperf, _sectperf, prof_pos, _prof_pos, sex, _sex, unit, _unit, _flag]
Index: [] Int64Index([], dtype='int64')
Italy
Netherlands
Germany
Norway
Empty DataFrame
Columns: [time, _geo, geo, value, sectperf, _sectperf, prof_pos, _prof_pos, sex, _sex, unit, _unit, _flag]
Index: [] Int64Index([], dtype='int64')
Denmark
Austria
France
Ireland
Portugal
Switzerland
Empty DataFrame
Columns: [time, _geo, geo, value, sectperf, _sectperf, prof_pos, _prof_pos, sex, _sex, unit, _unit, _flag]
Index: [] Int64Index([], dtype='int64')
Greece
Turkey
Cyprus
Slovenia
Bulgaria
Romania
Latvia
Russia
Empty DataFrame
Columns: [time, _geo, geo, value, sectperf, _sectperf, prof_pos, _prof_pos, sex, _sex, unit, _unit, _flag]
Index: [] Int64Index([], dtype='int64')
Slovakia
Poland
Czechia
Lithuania
Estonia
Malta
Euro area (19 countries)
Empty DataFrame
Columns: [time, _geo, geo, value, sectperf, _sectperf, prof_pos, _prof_pos, sex, _sex, unit, _unit, _flag]
Index: [] Int64Index([], dtype='int64')
Luxembourg
European Union - 27 countries (from 2020)
Empty DataFrame
Columns: [time, _geo, geo, value, sectperf, _sectperf, prof_pos, _prof_pos, sex, _sex, unit, _unit, _flag]
Index: [] Int64Index([], dtype='int64')
Croatia
North Macedonia
Serbia
Montenegro
Bosnia and Herzegovina
Out[143]:
time _geo geo value sectperf _sectperf prof_pos _prof_pos sex _sex unit _unit _flag percent
53156 2006 HU Hungary 25971.0 All sectors TOTAL Total TOTAL Total T Full-time equivalent (FTE) FTE NaN 1.000000
51763 2006 HU Hungary 10797.0 All sectors TOTAL Total TOTAL Women W Full-time equivalent (FTE) FTE NaN 0.415733
53157 2007 HU Hungary 25954.0 All sectors TOTAL Total TOTAL Total T Full-time equivalent (FTE) FTE NaN 1.000000
51764 2007 HU Hungary 10504.0 All sectors TOTAL Total TOTAL Women W Full-time equivalent (FTE) FTE NaN 0.404716
51765 2008 HU Hungary 10985.0 All sectors TOTAL Total TOTAL Women W Full-time equivalent (FTE) FTE NaN 0.400869
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
52705 2013 BA Bosnia and Herzegovina 1399.0 All sectors TOTAL Total TOTAL Total T Full-time equivalent (FTE) FTE NaN 1.000000
52706 2014 BA Bosnia and Herzegovina 1767.0 All sectors TOTAL Total TOTAL Total T Full-time equivalent (FTE) FTE NaN 1.000000
51577 2014 BA Bosnia and Herzegovina 857.0 All sectors TOTAL Total TOTAL Women W Full-time equivalent (FTE) FTE NaN 0.485003
52707 2019 BA Bosnia and Herzegovina 2037.0 All sectors TOTAL Total TOTAL Total T Full-time equivalent (FTE) FTE NaN 1.000000
51578 2019 BA Bosnia and Herzegovina 1081.0 All sectors TOTAL Total TOTAL Women W Full-time equivalent (FTE) FTE NaN 0.530682

980 rows × 14 columns

In [361]:
country_dict = {"Bosnia and Herzegovina":"Bosnia y Herzegovina", 
"Spain":"España",
"Belgium":"Bélgica",
"Cyprus":"Chipre",
"Czechia":"República Checa",
"France":"Francia",
"Germany":"Alemania",
"Denmark":"Dinamarca",
"Greece":"Grecia",
"Croatia":"Croacia",
"Hungary":"Hungaria",
"Ireland":"Irlanda",
"Iceland":"Islandia",
"Italy":"Italia", 
"Lithuania":"Lituania", 
"Luxembourg":"Luxemburgo", 
"Latvia":"Letonia",
"North Macedonia":"Macedonia del Norte",
"Netherlands":"Paises Bajos",
"Poland":"Polonia",
"Romania":"Rumanía",
"Sweden":"Suecia",
"Norway":"Noruega",
"Slovenia":"Eslovenia",
"Slovakia":"Eslovaquia",
"Turkey":"Turquía",
"Switzerland":"Suiza"}

def convert_country_name(country_name):
    translated_country = None
    if country_name in country_dict.keys():
        translated_country = country_dict[country_name]
    else: 
        translated_country = country_name
    
    return translated_country
        

df_cleaned.geo = df_cleaned.geo.apply(convert_country_name)
In [154]:
avg_women_percent_by_year = {}
for year in df_cleaned.time.unique():
    avg_women_percent_by_year[year] = df_cleaned[(df_cleaned.sex=="Women") & 
                                                 (df_cleaned.time==year)]["value"].sum() / \
                                      df_cleaned[(df_cleaned.sex=="Total") & 
                                                 (df_cleaned.time==year)]["value"].sum()
        
avg_women_percent_by_year
Out[154]:
{2006: 0.364376751262983,
 2007: 0.3238009195484552,
 2008: 0.37567710405520155,
 2009: 0.3318481072479871,
 2010: 0.33348615226354544,
 2011: 0.32244098509597036,
 2012: 0.3422411713648949,
 2013: 0.324851388660272,
 2014: 0.3511998112391364,
 2015: 0.3302487361180176,
 2016: 0.3546968158407643,
 2017: 0.3269165733273229,
 2018: 0.35565858451639415,
 2019: 0.3276984680329024,
 1997: 0.26904255770150864,
 1999: 0.28791480277447107,
 2001: 0.3066278856677181,
 2002: 0.36760359979190455,
 2003: 0.3103814777719663,
 2004: 0.36246515363898385,
 2005: 0.3241359354453265,
 2000: 0.4850806901276067,
 1998: 0.4192304889601399,
 2020: 0.3504186422865714,
 1994: 0.4722347718478069,
 1995: 0.4811903325407247,
 1996: 0.4762487366349274,
 1993: 0.5367495072826035}
In [362]:
fig = px.scatter(df_cleaned[(df_cleaned.sex=="Women") & (df_cleaned.time>2000)], x="geo", y="percent", 
            color="time",  hover_name="sex", height=700,
            labels={
                 "geo": "Paises",
                 "time": "Año",
                 "percent": "Porcentaje"
             })
fig.write_html("women_europe_rd.html")
fig.show()
In [324]:
fig = px.scatter(df_cleaned[(df_cleaned.sex=="Women") 
                            & (df_cleaned.time>2000) & 
                            (df_cleaned.geo=="Spain")], x="time", y="percent", height=400, color="time",
                            labels={
                                 "geo": "Paises",
                                 "time": "Año",
                                 "percent": "Porcentaje"
                             },
                color_discrete_sequence=["#83539D", "#EF7949"])
fig.write_html("women_spain_rd.html")
fig.show()
In [196]:
df_investigators = pd.read_csv("carrera_investigadora.csv")
df_investigators.iloc[3, 0] = 'Tesis aprobadas'
df_investigators.iloc[1, 0] = 'Estudios de máster'
df_investigators
Out[196]:
Nivel de la carrera investigadora Mujeres Hombres
0 Estudios de grado y primer y segundo ciclo 55.2 44.8
1 Estudios de máster 54.4 45.5
2 Esudios de doctorado 50.0 50.0
3 Tesis aprobadas 49.8 50.2
4 Grado D 47.6 52.4
5 Grado C 50.0 50.0
6 Grado B 44.5 55.5
7 Grado A 24.1 75.9
In [197]:
df_investigators = df_investigators.melt(id_vars=["Nivel de la carrera investigadora"], 
                                         value_vars=["Mujeres", "Hombres"], 
                                         var_name="Sexo", 
                                         value_name="Porcentaje")
df_investigators
Out[197]:
Nivel de la carrera investigadora Sexo Porcentaje
0 Estudios de grado y primer y segundo ciclo Mujeres 55.2
1 Estudios de máster Mujeres 54.4
2 Esudios de doctorado Mujeres 50.0
3 Tesis aprobadas Mujeres 49.8
4 Grado D Mujeres 47.6
5 Grado C Mujeres 50.0
6 Grado B Mujeres 44.5
7 Grado A Mujeres 24.1
8 Estudios de grado y primer y segundo ciclo Hombres 44.8
9 Estudios de máster Hombres 45.5
10 Esudios de doctorado Hombres 50.0
11 Tesis aprobadas Hombres 50.2
12 Grado D Hombres 52.4
13 Grado C Hombres 50.0
14 Grado B Hombres 55.5
15 Grado A Hombres 75.9
In [327]:
import plotly.express as px

fig = px.line(df_investigators, x="Nivel de la carrera investigadora", y="Porcentaje",
             color='Sexo',
             height=500, color_discrete_sequence=["#83539D", "#EF7949"])
fig.write_html("genero_investigadoras.html")
fig.show()
In [225]:
df_computer_use = pd.read_csv("2022-05-02-ta_resdig_dig_comp__isoc_ci_cfp_cu_computer use male.csv")
#df_computer_use_female = pd.read_csv("2022-05-02-ta_resdig_dig_comp__isoc_ci_cfp_cu_comuter use women.csv")

df_computer_use
Out[225]:
time _geo geo value unit _unit indic_is _indic_is ind_type _ind_type _flag
0 2010 AT Austria 82.0 Percentage of individuals PC_IND Last computer use: within last 3 months I_C3 Individuals who are born in another EU Member ... CB_EU_FOR NaN
1 2011 AT Austria 75.0 Percentage of individuals PC_IND Last computer use: within last 3 months I_C3 Individuals who are born in another EU Member ... CB_EU_FOR NaN
2 2012 AT Austria 89.0 Percentage of individuals PC_IND Last computer use: within last 3 months I_C3 Individuals who are born in another EU Member ... CB_EU_FOR NaN
3 2013 AT Austria 83.0 Percentage of individuals PC_IND Last computer use: within last 3 months I_C3 Individuals who are born in another EU Member ... CB_EU_FOR NaN
4 2014 AT Austria 81.0 Percentage of individuals PC_IND Last computer use: within last 3 months I_C3 Individuals who are born in another EU Member ... CB_EU_FOR NaN
... ... ... ... ... ... ... ... ... ... ... ...
184577 2008 UK United Kingdom 70.0 Percentage of individuals PC_IND Computer use: never I_CUX Individuals, 75 years old or more Y75_MAX NaN
184578 2009 UK United Kingdom 72.0 Percentage of individuals PC_IND Computer use: never I_CUX Individuals, 75 years old or more Y75_MAX NaN
184579 2010 UK United Kingdom 66.0 Percentage of individuals PC_IND Computer use: never I_CUX Individuals, 75 years old or more Y75_MAX NaN
184580 2011 UK United Kingdom 60.0 Percentage of individuals PC_IND Computer use: never I_CUX Individuals, 75 years old or more Y75_MAX NaN
184581 2012 UK United Kingdom 58.0 Percentage of individuals PC_IND Computer use: never I_CUX Individuals, 75 years old or more Y75_MAX NaN

184582 rows × 11 columns

In [226]:
df_computer_use.indic_is.unique(), df_computer_use.ind_type.unique()
Out[226]:
(array(['Last computer use: within last 3 months', 'Computer use: never',
        'Last computer use: between 3 and 12 months ago',
        'Individuals who have ever used a computer',
        'Last computer use: within last 12 months',
        'Individuals who used a computer more than a year ago'],
       dtype=object),
 array(['Individuals who are born in another EU Member State',
        'Non-nationals', 'Individuals who are born in non-EU country',
        'Individuals who are foreign-born',
        'Individuals who are native-born',
        'Nationals of another EU-Member State',
        'Nationals of non-EU country', 'Nationals',
        'Active labour force (employed and unemployed)',
        'Females with low formal education',
        'Females with medium formal education',
        'Females with high formal education',
        'Females, 16 to 19 years old', 'Females, 16 to 24 years old',
        'Females, 16 to 29 years old', 'Females, 16 to 74 years old',
        'Females, 20 to 24 years old', 'Females, 25 to 29 years old',
        'Females 25 to 54 years old', 'Females, 25 to 64 years old',
        'Females 55 to 74 years old',
        'Individual living in a household with income in first quartile',
        'Individual living in a household with income in second quartile',
        'Individual living in a household with income in third quartile',
        'Individual living in a household with income in fourth quartile',
        'Individuals with no or low formal education',
        'Individuals with medium formal education',
        'Individuals with high formal education',
        'Individuals living in a household with broadband access',
        'Individuals living in a household with Internet access but with no broadband access',
        'Individuals living in a household with children',
        'Individuals living in densely-populated area (at least 500 inhabitants/Km²)',
        'Individuals living in intermediate urbanized area (between 100 and 499 inhabitants/Km²)',
        'Individuals living in sparsely populated area (less than 100 inhabitants/Km²)',
        "Individuals living in Not Objective 1 regions / 'Regional Competitiveness and Employment' Region",
        'Individuals living in a household without children',
        "Individuals living in Objective 1 regions / 'Convergence' regions",
        'All Individuals', 'Non-manual including the armed forces',
        'Manual', 'ICT professionals', 'Non ICT professionals',
        'Mobile internet users', 'Non-users of mobile internet',
        'Males with low formal education',
        'Males with medium formal education',
        'Males with high formal education', 'Males, 16 to 19 years old',
        'Males, 16 to 24 years old', 'Males, 16 to 29 years old',
        'Males, 16 to 74 years old', 'Males, 20 to 24 years old',
        'Males, 25 to 29 years old', 'Males 25 to 54 years old',
        'Males, 25 to 64 years old', 'Males 55 to 74 years old',
        'Retired Individuals', 'Retired and other inactive',
        'Individuals with at least one of the 3 following characteristics: 55 to 74 years old; low education; unemployed or inactive or retired',
        'Individuals with at least two of the 3 following characteristics: 55 to 74 years old; low education; unemployed or inactive or retired',
        'Employees', 'Employees, self-employed, family workers',
        'Self-employed, family workers', 'Students', 'Unemployed',
        'Individuals, 15 years old or less',
        'Individuals, 16 to 19 years old',
        'Individuals, 16 to 24 years old',
        'Individuals aged 16-24 with high formal education',
        'Individuals aged 16-24 with low education',
        'Individuals aged 16-24 with medium formal education',
        'Individuals, 16 to 29 years old',
        'Individuals aged 16-29 with high formal education',
        'Individuals aged 16-29 with low formal education',
        'Individuals aged 16-29 with medium formal education',
        'Individuals, 20 to 24 years old',
        'Individuals, 25 to 29 years old',
        'Individuals, 25 to 34 years old',
        'Individuals, 25 to 54 years old',
        'Individuals aged 25 to 54 with high formal education',
        'Individuals aged 25 to 54 with low formal education',
        'Individuals aged 25 to 54 with medium formal education',
        'Individuals, 25 to 64 years old',
        'Individuals aged 25 to 64 with high formal education',
        'Individuals aged 25 to 64 with low formal education',
        'Individuals aged 25 to 64 with medium formal education',
        'Individuals aged 25 to 64 who are in the active labour force (employed and unemployed)',
        'Individuals aged 25 to 64 who are retired or other inactive',
        'Individuals aged 25 to 64 who are employees, self-employed or family workers',
        'Individuals aged 25 to 64 who are unemployed',
        'Individuals, 35 to 44 years old',
        'Individuals, 45 to 54 years old',
        'Individuals, 55 to 64 years old',
        'Individuals, 55 to 74 years old',
        'Individuals aged 55 to 74 with high formal education',
        'Individuals aged 55 to 74 with low formal education',
        'Individuals aged 55 to 74 with medium formal education',
        'Individuals, 65 to 74 years old',
        'Individuals, 75 years old or more'], dtype=object))
In [336]:
df_computer_use_male = df_computer_use[(df_computer_use.ind_type.str.contains("Males")) & 
                     (df_computer_use.ind_type.str.contains("years old")) & 
                     (df_computer_use.indic_is=="Last computer use: within last 3 months")]
df_computer_use_male["Sexo"] = "Hombre"

df_computer_use_female = df_computer_use[(df_computer_use.ind_type.str.contains("Females")) & 
                     (df_computer_use.ind_type.str.contains("years old")) & 
                     (df_computer_use.indic_is=="Last computer use: within last 3 months")]
df_computer_use_female["Sexo"] = "Mujer"

computer_use_cleaned = pd.concat([df_computer_use_male, df_computer_use_female])
computer_use_cleaned["ind_type_new"] = computer_use_cleaned._ind_type.str.slice(start=-5, stop=8).str.replace("_", "-")
computer_use_cleaned
/tmp/ipykernel_727/3118441712.py:4: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

/tmp/ipykernel_727/3118441712.py:9: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Out[336]:
time _geo geo value unit _unit indic_is _indic_is ind_type _ind_type _flag Sexo ind_type_new
15342 2011 AT Austria 99.0 Percentage of individuals PC_IND Last computer use: within last 3 months I_C3 Males, 16 to 19 years old M_Y16_19 NaN Hombre 16-19
15343 2012 AT Austria 100.0 Percentage of individuals PC_IND Last computer use: within last 3 months I_C3 Males, 16 to 19 years old M_Y16_19 NaN Hombre 16-19
15344 2013 AT Austria 100.0 Percentage of individuals PC_IND Last computer use: within last 3 months I_C3 Males, 16 to 19 years old M_Y16_19 NaN Hombre 16-19
15345 2014 AT Austria 100.0 Percentage of individuals PC_IND Last computer use: within last 3 months I_C3 Males, 16 to 19 years old M_Y16_19 NaN Hombre 16-19
15346 2015 AT Austria 99.0 Percentage of individuals PC_IND Last computer use: within last 3 months I_C3 Males, 16 to 19 years old M_Y16_19 NaN Hombre 16-19
... ... ... ... ... ... ... ... ... ... ... ... ... ...
6489 2013 UK United Kingdom 72.0 Percentage of individuals PC_IND Last computer use: within last 3 months I_C3 Females 55 to 74 years old F_Y55_74 NaN Mujer 55-74
6490 2014 UK United Kingdom 77.0 Percentage of individuals PC_IND Last computer use: within last 3 months I_C3 Females 55 to 74 years old F_Y55_74 NaN Mujer 55-74
6491 2015 UK United Kingdom 78.0 Percentage of individuals PC_IND Last computer use: within last 3 months I_C3 Females 55 to 74 years old F_Y55_74 NaN Mujer 55-74
6492 2017 UK United Kingdom 84.0 Percentage of individuals PC_IND Last computer use: within last 3 months I_C3 Females 55 to 74 years old F_Y55_74 NaN Mujer 55-74
6493 2017 XK Kosovo 13.0 Percentage of individuals PC_IND Last computer use: within last 3 months I_C3 Females 55 to 74 years old F_Y55_74 NaN Mujer 55-74

6038 rows × 13 columns

In [337]:
spain_xy = computer_use_cleaned[(computer_use_cleaned.geo=="Spain") & 
                                (computer_use_cleaned.time==2017)][["ind_type_new", "value", "Sexo"]].values
spain_xy[:, 0], spain_xy[:, 1], spain_xy[:, 2]
Out[337]:
(array(['16-19', '16-24', '16-29', '16-74', '20-24', '25-29', '25-54',
        '25-64', '55-74', '16-19', '16-24', '16-29', '16-74', '20-24',
        '25-29', '25-54', '25-64', '55-74'], dtype=object),
 array([95.0, 95.0, 93.0, 77.0, 95.0, 89.0, 83.0, 79.0, 55.0, 91.0, 92.0,
        89.0, 71.0, 93.0, 84.0, 82.0, 76.0, 44.0], dtype=object),
 array(['Hombre', 'Hombre', 'Hombre', 'Hombre', 'Hombre', 'Hombre',
        'Hombre', 'Hombre', 'Hombre', 'Mujer', 'Mujer', 'Mujer', 'Mujer',
        'Mujer', 'Mujer', 'Mujer', 'Mujer', 'Mujer'], dtype=object))
In [370]:
color_list = []

for item in spain_xy[:, 2]:
    if item == "Hombre": 
        color_list.append("#EF7949")
    else:
        color_list.append("#83539D")

color_list
Out[370]:
['#EF7949',
 '#EF7949',
 '#EF7949',
 '#EF7949',
 '#EF7949',
 '#EF7949',
 '#EF7949',
 '#EF7949',
 '#EF7949',
 '#83539D',
 '#83539D',
 '#83539D',
 '#83539D',
 '#83539D',
 '#83539D',
 '#83539D',
 '#83539D',
 '#83539D']
In [373]:
import plotly.express as px
import plotly.graph_objects as go



fig = px.violin(computer_use_cleaned[computer_use_cleaned.time==2017], x="ind_type_new", y="value",
             color='Sexo', hover_data=["geo"], points='all', 
             #facet_col="time", facet_col_wrap=2,
             height=600, width=1000,
             color_discrete_sequence=["#EF7949", "#83539D"],
            labels={
                 "value": "Porcentaje",
                 "ind_type_new": "Grupos de Edad",
             })
fig.add_trace(go.Scatter(x=spain_xy[:, 0], y=spain_xy[:, 1], mode = 'markers',
         marker_size = 20, name="Spain", opacity=0.8,
         marker_color=color_list,))
fig.write_html("computer_use.html")
fig.show()
In [339]:
df_internet_use = pd.read_csv("2022-05-02-ta_resdig_dig_intuse__isoc_ci_ifp_iu_individuals_internet used_female.csv")



df_internet_use_male = df_internet_use[(df_internet_use.ind_type.str.contains("Males")) & 
                     (df_internet_use.ind_type.str.contains("years old")) & 
                     (df_internet_use.indic_is=='Last Internet use: in last 3 months')]
df_internet_use_male["Sexo"] = "Hombre"

df_internet_use_female = df_internet_use[(df_internet_use.ind_type.str.contains("Females")) & 
                      (df_internet_use.ind_type.str.contains("years old")) & 
                      (df_internet_use.indic_is=='Last Internet use: in last 3 months')]
df_internet_use_female["Sexo"] = "Mujer"

internet_use_cleaned = pd.concat([df_internet_use_male, df_internet_use_female])
internet_use_cleaned["ind_type_new"] = internet_use_cleaned._ind_type.str.slice(start=-5, stop=8).str.replace("_", "-")
internet_use_cleaned
/tmp/ipykernel_727/2861403796.py:8: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

/tmp/ipykernel_727/2861403796.py:13: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Out[339]:
time _geo geo _flag indic_is _indic_is unit _unit ind_type _ind_type value Sexo ind_type_new
219659 2018 AL Albania NaN Last Internet use: in last 3 months I_IU3 Percentage of individuals PC_IND Males, 16 to 19 years old M_Y16_19 90.0 Hombre 16-19
219660 2019 AL Albania NaN Last Internet use: in last 3 months I_IU3 Percentage of individuals PC_IND Males, 16 to 19 years old M_Y16_19 96.0 Hombre 16-19
219661 2020 AL Albania NaN Last Internet use: in last 3 months I_IU3 Percentage of individuals PC_IND Males, 16 to 19 years old M_Y16_19 95.0 Hombre 16-19
219662 2021 AL Albania NaN Last Internet use: in last 3 months I_IU3 Percentage of individuals PC_IND Males, 16 to 19 years old M_Y16_19 97.0 Hombre 16-19
219663 2011 AT Austria NaN Last Internet use: in last 3 months I_IU3 Percentage of individuals PC_IND Males, 16 to 19 years old M_Y16_19 98.0 Hombre 16-19
... ... ... ... ... ... ... ... ... ... ... ... ... ...
206804 2020 UK United Kingdom NaN Last Internet use: in last 3 months I_IU3 Percentage of individuals PC_IND Females 55 to 74 years old F_Y55_74 92.0 Mujer 55-74
206805 2017 XK Kosovo NaN Last Internet use: in last 3 months I_IU3 Percentage of individuals PC_IND Females 55 to 74 years old F_Y55_74 61.0 Mujer 55-74
206806 2018 XK Kosovo NaN Last Internet use: in last 3 months I_IU3 Percentage of individuals PC_IND Females 55 to 74 years old F_Y55_74 72.0 Mujer 55-74
206807 2019 XK Kosovo NaN Last Internet use: in last 3 months I_IU3 Percentage of individuals PC_IND Females 55 to 74 years old F_Y55_74 80.0 Mujer 55-74
206808 2020 XK Kosovo NaN Last Internet use: in last 3 months I_IU3 Percentage of individuals PC_IND Females 55 to 74 years old F_Y55_74 89.0 Mujer 55-74

9481 rows × 13 columns

In [374]:
spain_xy = internet_use_cleaned[(internet_use_cleaned.geo=="Spain") & 
                                (internet_use_cleaned.time==2017)][["ind_type_new", "value", "Sexo"]].values


color_list = []

for item in spain_xy[:, 2]:
    if item == "Hombre": 
        color_list.append("#EF7949")
    else:
        color_list.append("#83539D")

color_list, spain_xy[:, 0], spain_xy[:, 1], spain_xy[:, 2]
Out[374]:
(['#EF7949',
  '#EF7949',
  '#EF7949',
  '#EF7949',
  '#EF7949',
  '#EF7949',
  '#EF7949',
  '#EF7949',
  '#EF7949',
  '#83539D',
  '#83539D',
  '#83539D',
  '#83539D',
  '#83539D',
  '#83539D',
  '#83539D',
  '#83539D',
  '#83539D'],
 array(['16-19', '16-24', '16-29', '16-74', '20-24', '25-29', '25-54',
        '25-64', '55-74', '16-19', '16-24', '16-29', '16-74', '20-24',
        '25-29', '25-54', '25-64', '55-74'], dtype=object),
 array([98.0, 98.0, 98.0, 85.0, 98.0, 98.0, 94.0, 90.0, 63.0, 96.0, 98.0,
        97.0, 84.0, 100.0, 95.0, 94.0, 89.0, 59.0], dtype=object),
 array(['Hombre', 'Hombre', 'Hombre', 'Hombre', 'Hombre', 'Hombre',
        'Hombre', 'Hombre', 'Hombre', 'Mujer', 'Mujer', 'Mujer', 'Mujer',
        'Mujer', 'Mujer', 'Mujer', 'Mujer', 'Mujer'], dtype=object))
In [376]:
fig = px.violin(internet_use_cleaned[internet_use_cleaned.time==2017], x="ind_type_new", y="value",
             color='Sexo', hover_data=["geo"], points='all', 
             #facet_col="time", facet_col_wrap=2,
             height=600, width=1000,
             color_discrete_sequence=["#EF7949", "#83539D"],             labels={
                 "value": "Porcentaje",
                 "ind_type_new": "Grupos de Edad",
             })
fig.add_trace(go.Scatter(x=spain_xy[:, 0], y=spain_xy[:, 1], mode = 'markers',
         marker_size = 20, name="Spain", opacity=0.8,
         marker_color=color_list,))
fig.write_html("internet_use.html")
fig.show()
In [ ]: